package com.tensquare.articlecrawler.processor;
/*
 *ClassName:ArticleProcessor
 *Description:文章爬取类
 *@author:可爱的大鸭子
 */

import org.springframework.stereotype.Component;
import us.codecraft.webmagic.Page;
import us.codecraft.webmagic.Site;
import us.codecraft.webmagic.processor.PageProcessor;

/**
 * WebMagic {@link PageProcessor} that crawls CSDN blog articles.
 *
 * <p>For every processed page it queues all links that match the article-detail URL pattern,
 * then extracts the article title (with HTML tags removed) and the article content. Pages that
 * lack either a title or content are marked as skipped.
 */
@Component
public class ArticleProcessor implements PageProcessor {

    /**
     * Regex used to pick article-detail links out of a page.
     *
     * <p>NOTE(review): the character class also admits spaces and the dots in the host name are
     * unescaped; the pattern is kept exactly as it was to avoid changing which links are crawled
     * -- confirm whether it should be tightened.
     */
    private static final String ARTICLE_URL_REGEX =
            "https://blog.csdn.net/[a-z A-Z 0-9 -]+/article/details/[0-9]{9}";

    /** XPath of the {@code <h1>} element holding the article title. */
    private static final String TITLE_XPATH =
            "//*[@id=\"mainBox\"]/main/div[1]/div/div/div[1]/h1";

    /** XPath of the element holding the article body. */
    private static final String CONTENT_XPATH = "//*[@id=\"content_views\"]";

    /** Matches any HTML tag; used to strip the surrounding markup from the title. */
    private static final String HTML_TAG_REGEX = "<[^>]+>";

    /** Retry count passed to {@link Site#setRetryTimes(int)}. */
    private static final int RETRY_TIMES = 5000;

    /** Sleep time passed to {@link Site#setSleepTime(int)} (presumably milliseconds -- verify). */
    private static final int SLEEP_TIME = 100;

    /**
     * Queues further article links found on the page and extracts this page's title and content.
     *
     * <p>When both title and content are present they are stored in the page's result fields
     * {@code "title"} and {@code "content"}; otherwise the page is flagged to be skipped.
     *
     * @param page the downloaded page to process
     */
    @Override
    public void process(Page page) {
        page.addTargetRequests(page.getHtml().links().regex(ARTICLE_URL_REGEX).all());

        String title = page.getHtml().xpath(TITLE_XPATH).get();
        String content = page.getHtml().xpath(CONTENT_XPATH).get();

        // Strip the HTML tags from the title. Strings are immutable, so the result of
        // replaceAll must be assigned back; the null guard avoids a NullPointerException on
        // pages that have no title element.
        if (title != null) {
            title = title.replaceAll(HTML_TAG_REGEX, "");
        }

        // Print what was extracted from the page.
        System.out.println("标题："+title);
        System.out.println("内容："+content);

        if (title != null && content != null) {
            // Both title and content were found: publish them as result fields.
            page.putField("title", title);
            page.putField("content", content);
        } else {
            System.out.println("都是空的！");
            // Nothing usable on this page: mark it as skipped.
            page.setSkip(true);
        }
    }

    /**
     * Builds the crawler site configuration.
     *
     * <p>NOTE(review): a retry count of 5000 looks unusually high -- confirm it is intentional.
     *
     * @return a new {@link Site} with the retry count and sleep time configured
     */
    @Override
    public Site getSite() {
        return Site.me().setRetryTimes(RETRY_TIMES).setSleepTime(SLEEP_TIME);
    }
}
